The zinb model doesn’t seem to work well in the Fluidigm data. This is independent of normalization, meaning that both unnormalized data and normalized data (both with scran and full-quantile) lead to bad results.
PCA leads to better results, even though, as many people have shown, the first component is highly correlated to detection rate, even after normalization.
ZIFA seems to work well in the Fluidigm datasets, suggesting that we should be able to find a way to make zinb work as well.
One key difference between ZIFA and zinb is that in our model both \(\mu\) and \(\pi\) depend on \(W\), while in the ZIFA model only \(\mu\) does. We should consider a model that doesn’t have a \(W\) in \(\pi\).
It seems (preliminary analyses) that removing the intercept in V makes the weird W behavior go away. Still, I think that we should add the ability to remove W from \(\pi\) and see what is the difference.
Select high coverage cells, filter out the genes that do not have at least 10 counts in at least 10 cells.
data("fluidigm")
fluidigm_high <- fluidigm[,which(colData(fluidigm)$Coverage_Type=="High")]
filter <- apply(assay(fluidigm_high)>10, 1, sum)>=10
Number of retained genes:
print(sum(filter))
## [1] 6998
Fit a zinb model on the high coverage data (no normalization).
fluidigm_high <- fluidigm_high[filter,]
high <- assay(fluidigm_high)
set.seed(23424)
print(system.time(zinb_high <- zinbFit(high, ncores = 3, K = 2)))
## user system elapsed
## 670.571 72.622 296.439
Plot the results with cells colored according to their biological condition.
bio <- as.factor(colData(fluidigm_high)$Biological_Condition)
cl <- as.factor(colData(fluidigm_high)$Cluster2)
plot(zinb_high@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_high@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)
Compared to PCA (of the log counts)
pca <- prcomp(t(log1p(high)))
plot(pca$x, col=cols[bio], pch=19)
legend("topleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)
Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.
#first factor and total number of detected genes in the cell
cor(zinb_high@W[,1], colSums(high>0),method="spearman")
## [1] -0.2168706
#second factor and the total number of detected genes in the cell
cor(zinb_high@W[,2], colSums(high>0),method="spearman")
## [1] 0.5244318
#first factor and the total number of counts in the cell
cor(zinb_high@W[,1], colSums(high),method="spearman")
## [1] -0.1518794
#second factor and the total number of counts in the cell
cor(zinb_high@W[,2], colSums(high),method="spearman")
## [1] 0.3468531
Same for PCA
#first factor and total number of detected genes in the cell
cor(pca$x[,1], colSums(high>0), method="spearman")
## [1] 0.9232517
#second factor and the total number of detected genes in the cell
cor(pca$x[,2], colSums(high>0), method="spearman")
## [1] 0.297028
#first factor and the total number of counts in the cell
cor(pca$x[,1], colSums(high), method="spearman")
## [1] 0.1270105
#second factor and the total number of counts in the cell
cor(pca$x[,2], colSums(high), method="spearman")
## [1] -0.3141171
Correlation between PCA and ZINB
cor(pca$x[,1], zinb_high@W[,1], method="spearman")
## [1] -0.2783654
cor(pca$x[,2], zinb_high@W[,1], method="spearman")
## [1] -0.01188811
cor(pca$x[,1], zinb_high@W[,2], method="spearman")
## [1] 0.7553322
cor(pca$x[,2], zinb_high@W[,2], method="spearman")
## [1] -0.4728147
sce <- newSCESet(countData=data.frame(high))
sce <- computeSumFactors(sce, sizes=c(10, 20, 30))
sf <- sizeFactors(sce)
norm <- exprs(convertTo(sce, type="monocle"))
fq <- betweenLaneNormalization(high, which="full")
plotRLE(high, outline=FALSE, col=cols[bio], main="Unnormalized counts")
plotRLE(norm, outline=FALSE, col=cols[bio], main="SCRAN normalization")
plotRLE(fq, outline=FALSE, col=cols[bio], main="FQ normalization")
I’m not convinced that this is the best normalization, but since it’s the only one specifically proposed for scRNA-seq, it’s a good place to start.
set.seed(3525)
offsets <- matrix(rep(sf, NROW(high)), ncol=NROW(high), nrow=NCOL(high))
print(system.time(zinb_norm <- zinbFit(high, ncores = 3, K = 2, O_mu = offsets)))
## user system elapsed
## 748.390 80.517 318.133
Plot the results with cells colored according to their biological condition.
plot(zinb_norm@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_norm@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)
qc <- as.matrix(colData(fluidigm_high)[,metadata(fluidigm_high)$which_qc])
qcpca <- prcomp(qc, scale=TRUE)
quality <- qcpca$x[,1]
detection_rate <- colSums(high>0)
data.frame(W1=zinb_norm@W[,1], W2=zinb_norm@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()
Compared to PCA (of the log counts)
pca_norm <- prcomp(t(log1p(norm)))
plot(pca_norm$x, col=cols[bio], pch=19)
legend("bottomleft", levels(bio), fill=cols)
plot(pca_norm$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)
Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.
#first factor and total number of detected genes in the cell
cor(zinb_norm@W[,1], colSums(high>0),method="spearman")
## [1] -0.2240385
#second factor and the total number of detected genes in the cell
cor(zinb_norm@W[,2], colSums(high>0),method="spearman")
## [1] 0.4842657
#first factor and the total number of counts in the cell
cor(zinb_norm@W[,1], colSums(high),method="spearman")
## [1] -0.1604021
#second factor and the total number of counts in the cell
cor(zinb_norm@W[,2], colSums(high),method="spearman")
## [1] 0.3326923
Same for PCA
#first factor and total number of detected genes in the cell
cor(pca_norm$x[,1], colSums(high>0), method="spearman")
## [1] 0.9566434
#second factor and the total number of detected genes in the cell
cor(pca_norm$x[,2], colSums(high>0), method="spearman")
## [1] -0.1056818
#first factor and the total number of counts in the cell
cor(pca_norm$x[,1], colSums(high), method="spearman")
## [1] -0.07137238
#second factor and the total number of counts in the cell
cor(pca_norm$x[,2], colSums(high), method="spearman")
## [1] 0.4917832
Correlation between PCA and ZINB
cor(pca_norm$x[,1], zinb_norm@W[,1], method="spearman")
## [1] -0.2262675
cor(pca_norm$x[,2], zinb_norm@W[,1], method="spearman")
## [1] 0.004195804
cor(pca_norm$x[,1], zinb_norm@W[,2], method="spearman")
## [1] 0.6608392
cor(pca_norm$x[,2], zinb_norm@W[,2], method="spearman")
## [1] 0.6752622
se <- newSeqExpressionSet(high)
se <- betweenLaneNormalization(se, which="full", offset=TRUE)
offsets <- t(offst(se))
set.seed(35235)
print(system.time(zinb_fq <- zinbFit(high, ncores = 3, K = 2, O_mu = offsets)))
## user system elapsed
## 841.972 87.240 353.576
Plot the results with cells colored according to their biological condition.
plot(zinb_fq@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_fq@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)
data.frame(W1=zinb_fq@W[,1], W2=zinb_fq@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()
Compared to PCA (of the log counts)
pca_fq <- prcomp(t(log1p(fq)))
plot(pca_fq$x, col=cols[bio], pch=19)
legend("topright", levels(bio), fill=cols)
plot(pca_fq$x, col=cols2[cl], pch=19)
legend("topright", levels(cl), fill=cols2)
Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.
#first factor and total number of detected genes in the cell
cor(zinb_fq@W[,1], colSums(high>0),method="spearman")
## [1] 0.08378497
#second factor and the total number of detected genes in the cell
cor(zinb_fq@W[,2], colSums(high>0),method="spearman")
## [1] 0.3754808
#first factor and the total number of counts in the cell
cor(zinb_fq@W[,1], colSums(high),method="spearman")
## [1] -0.1039336
#second factor and the total number of counts in the cell
cor(zinb_fq@W[,2], colSums(high),method="spearman")
## [1] 0.3515297
Same for PCA
#first factor and total number of detected genes in the cell
cor(pca_fq$x[,1], colSums(high>0), method="spearman")
## [1] 0.7178322
#second factor and the total number of detected genes in the cell
cor(pca_fq$x[,2], colSums(high>0), method="spearman")
## [1] 0.6338724
#first factor and the total number of counts in the cell
cor(pca_fq$x[,1], colSums(high), method="spearman")
## [1] 0.2144231
#second factor and the total number of counts in the cell
cor(pca_fq$x[,2], colSums(high), method="spearman")
## [1] -0.2588287
Correlation between PCA and ZINB
cor(pca_fq$x[,1], zinb_fq@W[,1], method="spearman")
## [1] -0.002097902
cor(pca_fq$x[,2], zinb_fq@W[,1], method="spearman")
## [1] 0.1466783
cor(pca_fq$x[,1], zinb_fq@W[,2], method="spearman")
## [1] 0.7437937
cor(pca_fq$x[,2], zinb_fq@W[,2], method="spearman")
## [1] -0.1988199
## write matrices to file to feed to ZIFA in python
write.csv(log1p(high), file="logcounts_high.csv")
write.csv(log1p(norm), file="logcounts_high_scran.csv")
Select low coverage cells, filter out the genes that do not have at least 10 counts in at least 10 cells.
fluidigm_low <- fluidigm[,which(colData(fluidigm)$Coverage_Type=="Low")]
filter <- apply(assay(fluidigm_low)>10, 1, sum)>=10
Number of retained genes:
print(sum(filter))
## [1] 6998
Fit a zinb model on the low coverage data (no normalization).
fluidigm_low <- fluidigm_low[filter,]
low <- assay(fluidigm_low)
set.seed(654)
print(system.time(zinb_low <- zinbFit(low, ncores = 3, K = 2)))
## user system elapsed
## 379.503 60.009 163.815
Plot the results with cells colored according to their biological condition.
bio <- as.factor(colData(fluidigm_low)$Biological_Condition)
cl <- as.factor(colData(fluidigm_low)$Cluster2)
plot(zinb_low@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_low@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)
Compared to PCA (of the log counts)
pca <- prcomp(t(log1p(low)))
plot(pca$x, col=cols[bio], pch=19)
legend("topleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)
Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.
#first factor and total number of detected genes in the cell
cor(zinb_low@W[,1], colSums(low>0),method="spearman")
## [1] 0.4277894
#second factor and the total number of detected genes in the cell
cor(zinb_low@W[,2], colSums(low>0),method="spearman")
## [1] -0.5360534
#first factor and the total number of counts in the cell
cor(zinb_low@W[,1], colSums(low),method="spearman")
## [1] 0.0005681818
#second factor and the total number of counts in the cell
cor(zinb_low@W[,2], colSums(low),method="spearman")
## [1] 0.2904283
Same for PCA
#first factor and total number of detected genes in the cell
cor(pca$x[,1], colSums(low>0), method="spearman")
## [1] -0.6268781
#second factor and the total number of detected genes in the cell
cor(pca$x[,2], colSums(low>0), method="spearman")
## [1] -0.03942437
#first factor and the total number of counts in the cell
cor(pca$x[,1], colSums(low), method="spearman")
## [1] 0.06979895
#second factor and the total number of counts in the cell
cor(pca$x[,2], colSums(low), method="spearman")
## [1] -0.474257
Correlation between PCA and ZINB
cor(pca$x[,1], zinb_low@W[,1], method="spearman")
## [1] -0.8020979
cor(pca$x[,2], zinb_low@W[,1], method="spearman")
## [1] -0.06770105
cor(pca$x[,1], zinb_low@W[,2], method="spearman")
## [1] 0.8125874
cor(pca$x[,2], zinb_low@W[,2], method="spearman")
## [1] -0.4075612
sce <- newSCESet(countData=data.frame(low))
sce <- computeSumFactors(sce, sizes=c(10, 20, 30))
sf <- sizeFactors(sce)
norm <- exprs(convertTo(sce, type="monocle"))
plotRLE(low, outline=FALSE, col=cols[bio], main="Unnormalized counts")
plotRLE(norm, outline=FALSE, col=cols[bio], main="SCRAN normalization")
set.seed(3525)
offsets <- matrix(rep(sf, NROW(low)), ncol=NROW(low), nrow=NCOL(low))
print(system.time(zinb_norm <- zinbFit(low, ncores = 3, K = 2, O_mu = offsets)))
## user system elapsed
## 266.052 45.152 117.459
Plot the results with cells colored according to their biological condition.
plot(zinb_norm@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_norm@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)
qc <- as.matrix(colData(fluidigm_low)[,metadata(fluidigm_low)$which_qc])
qcpca <- prcomp(qc, scale=TRUE)
quality <- qcpca$x[,1]
detection_rate <- colSums(low>0)
data.frame(W1=zinb_norm@W[,1], W2=zinb_norm@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()
Compared to PCA (of the log counts)
pca_norm <- prcomp(t(log1p(norm)))
plot(pca_norm$x, col=cols[bio], pch=19)
legend("bottomleft", levels(bio), fill=cols)
plot(pca_norm$x, col=cols2[cl], pch=19)
legend("topleft", levels(cl), fill=cols2)
Check the correlations between two zinb factors, the total numbers of reads and the total numbers of detected genes.
#first factor and total number of detected genes in the cell
cor(zinb_norm@W[,1], colSums(low>0),method="spearman")
## [1] 0.02917491
#second factor and the total number of detected genes in the cell
cor(zinb_norm@W[,2], colSums(low>0),method="spearman")
## [1] -0.5416043
#first factor and the total number of counts in the cell
cor(zinb_norm@W[,1], colSums(low),method="spearman")
## [1] -0.03199301
#second factor and the total number of counts in the cell
cor(zinb_norm@W[,2], colSums(low),method="spearman")
## [1] 0.1593969
Same for PCA
#first factor and total number of detected genes in the cell
cor(pca_norm$x[,1], colSums(low>0), method="spearman")
## [1] -0.8312772
#second factor and the total number of detected genes in the cell
cor(pca_norm$x[,2], colSums(low>0), method="spearman")
## [1] 0.5694024
#first factor and the total number of counts in the cell
cor(pca_norm$x[,1], colSums(low), method="spearman")
## [1] 0.4491259
#second factor and the total number of counts in the cell
cor(pca_norm$x[,2], colSums(low), method="spearman")
## [1] -0.5332168
Correlation between PCA and ZINB
cor(pca_norm$x[,1], zinb_norm@W[,1], method="spearman")
## [1] 0.2093969
cor(pca_norm$x[,2], zinb_norm@W[,1], method="spearman")
## [1] 0.4050699
cor(pca_norm$x[,1], zinb_norm@W[,2], method="spearman")
## [1] 0.859965
cor(pca_norm$x[,2], zinb_norm@W[,2], method="spearman")
## [1] 0.1836538
x <- model.matrix(~scale(detection_rate))
set.seed(9948)
print(system.time(zinb_norm <- zinbFit(low, ncores = 3, K = 2, X = x, which_X_mu=1:2, which_X_pi=1L)))
## user system elapsed
## 381.644 47.689 170.539
plot(zinb_norm@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
plot(zinb_norm@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(cl), fill=cols2)
data.frame(W1=zinb_norm@W[,1], W2=zinb_norm@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=detection_rate)) + scale_colour_gradient(low="blue", high="yellow") + theme_classic()
## write matrices to file to feed to ZIFA in python
write.csv(log1p(low), file="logcounts_low.csv")
write.csv(log1p(norm), file="logcounts_low_scran.csv")
write.csv(log1p(low[1:5, 1:3]), file="logcounts_toy.csv")
zifa_res <- read.csv("zifa_low.csv", header=FALSE)
plot(zifa_res, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("topleft", levels(bio), fill=cols)
library(biomaRt)
mart <- useMart("ensembl")
mart <- useDataset("hsapiens_gene_ensembl", mart = mart)
attrs <- c("hgnc_symbol", "entrezgene")
bm <- getBM(attributes=attrs, mart = mart)
bm <- bm[match(rownames(low), bm[,1]),]
bm <- na.omit(bm)
low <- low[bm[,1],]
gene_info <- getGeneLengthAndGCContent(as.character(bm[,2]), "hsa", mode="org.db")
rownames(gene_info) <- bm[,1]
gene_info <- na.omit(gene_info)
low <- low[rownames(gene_info),]
NB: currently, there is no way to have no columns of X or V in either \(\mu\) or \(\pi\). We should let the user specify NULL?
V <- cbind(rep(0, NROW(low)), gene_info)
print(system.time(zinb_gc <- zinbFit(low, ncores = 3, K = 2, V = V, which_V_mu=1L, which_V_pi=2:3)))
## user system elapsed
## 194.554 23.157 88.511
plot(zinb_gc@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("bottomright", levels(bio), fill=cols)
plot(zinb_gc@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("bottomright", levels(cl), fill=cols2)
print(system.time(zinb_nov <- zinbFit(low, ncores = 3, K = 2, V=matrix(0, ncol=1, nrow=NROW(low)))))
## user system elapsed
## 300.793 56.159 143.995
plot(zinb_nov@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("bottomright", levels(bio), fill=cols)
plot(zinb_nov@W, col=cols2[cl], pch=19, xlab="W1", ylab="W2")
legend("bottomright", levels(cl), fill=cols2)
V <- cbind(rep(0, NROW(low)), rep(1, NROW(low)))
X <- cbind(rep(0, NCOL(low)), rep(1, NCOL(low)))
print(system.time(zinb_vx <- zinbFit(low, ncores = 3, K = 2, V=V, X=X, which_V_mu=1L, which_V_pi=2L, which_X_mu=2L, which_X_pi=1L)))
## user system elapsed
## 280.548 36.245 126.151
plot(zinb_vx@W, col=cols[bio], pch=19, xlab="W1", ylab="W2")
legend("bottomleft", levels(bio), fill=cols)